# download directly off tidytuesdaygithub repo
rent <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-07-05/rent.csv')

(1) What are the variable types? (2) Do they all correspond to what they really are? (3) Which variables have most missing values?
(1) The following variables are characters: post_id, nhood, city, county, address, title, descr, details.
The following variables are numeric, more specifically, double-precision floating-point: date, year, price, beds, baths, sqft, room_in_apt, lat, lon.
(2) Most types correspond to what the variables really are, but `date` is stored as a double (e.g. 20050111) rather than a proper date type, so it should be parsed into a date for any time-based analysis.
(3) The variable “descr” has the most missing values (~198k), followed by “address” (~197k) and “lon” (~196k)
# (1)
glimpse(rent)## Rows: 200,796
## Columns: 17
## $ post_id <chr> "pre2013_134138", "pre2013_135669", "pre2013_127127", "pre…
## $ date <dbl> 20050111, 20050126, 20041017, 20120601, 20041021, 20060411…
## $ year <dbl> 2005, 2005, 2004, 2012, 2004, 2006, 2007, 2017, 2009, 2006…
## $ nhood <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ city <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ county <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ price <dbl> 1250, 1295, 1100, 1425, 890, 825, 1500, 2925, 450, 1395, 1…
## $ beds <dbl> 2, 2, 2, 1, 1, 1, 1, 3, NA, 2, 2, 5, 4, 0, 4, 1, 3, 3, 1, …
## $ baths <dbl> 2, NA, NA, NA, NA, NA, 1, NA, 1, NA, NA, NA, 3, NA, NA, NA…
## $ sqft <dbl> NA, NA, NA, 735, NA, NA, NA, NA, NA, NA, NA, 2581, 1756, N…
## $ room_in_apt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ address <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ lat <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 37.5, NA, …
## $ lon <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ title <chr> "$1250 / 2br - 2BR/2BA 1145 ALAMEDA DE LAS PULGAS", "$12…
## $ descr <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ details <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "<p class=…
# (2)
glimpse(rent)## Rows: 200,796
## Columns: 17
## $ post_id <chr> "pre2013_134138", "pre2013_135669", "pre2013_127127", "pre…
## $ date <dbl> 20050111, 20050126, 20041017, 20120601, 20041021, 20060411…
## $ year <dbl> 2005, 2005, 2004, 2012, 2004, 2006, 2007, 2017, 2009, 2006…
## $ nhood <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ city <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ county <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ price <dbl> 1250, 1295, 1100, 1425, 890, 825, 1500, 2925, 450, 1395, 1…
## $ beds <dbl> 2, 2, 2, 1, 1, 1, 1, 3, NA, 2, 2, 5, 4, 0, 4, 1, 3, 3, 1, …
## $ baths <dbl> 2, NA, NA, NA, NA, NA, 1, NA, 1, NA, NA, NA, 3, NA, NA, NA…
## $ sqft <dbl> NA, NA, NA, 735, NA, NA, NA, NA, NA, NA, NA, 2581, 1756, N…
## $ room_in_apt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ address <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ lat <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 37.5, NA, …
## $ lon <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ title <chr> "$1250 / 2br - 2BR/2BA 1145 ALAMEDA DE LAS PULGAS", "$12…
## $ descr <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ details <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "<p class=…
# (3)
skimr::skim(rent)| Name | rent |
| Number of rows | 200796 |
| Number of columns | 17 |
| _______________________ | |
| Column type frequency: | |
| character | 8 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| post_id | 0 | 1.00 | 9 | 14 | 0 | 200796 | 0 |
| nhood | 0 | 1.00 | 4 | 43 | 0 | 167 | 0 |
| city | 0 | 1.00 | 5 | 19 | 0 | 104 | 0 |
| county | 1394 | 0.99 | 4 | 13 | 0 | 10 | 0 |
| address | 196888 | 0.02 | 1 | 38 | 0 | 2869 | 0 |
| title | 2517 | 0.99 | 2 | 298 | 0 | 184961 | 0 |
| descr | 197542 | 0.02 | 13 | 16975 | 0 | 3025 | 0 |
| details | 192780 | 0.04 | 4 | 595 | 0 | 7667 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| date | 0 | 1.00 | 2.01e+07 | 44694.07 | 2.00e+07 | 2.01e+07 | 2.01e+07 | 2.01e+07 | 2.02e+07 | ▁▇▁▆▃ |
| year | 0 | 1.00 | 2.01e+03 | 4.48 | 2.00e+03 | 2.00e+03 | 2.01e+03 | 2.01e+03 | 2.02e+03 | ▁▇▁▆▃ |
| price | 0 | 1.00 | 2.14e+03 | 1427.75 | 2.20e+02 | 1.30e+03 | 1.80e+03 | 2.50e+03 | 4.00e+04 | ▇▁▁▁▁ |
| beds | 6608 | 0.97 | 1.89e+00 | 1.08 | 0.00e+00 | 1.00e+00 | 2.00e+00 | 3.00e+00 | 1.20e+01 | ▇▂▁▁▁ |
| baths | 158121 | 0.21 | 1.68e+00 | 0.69 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 2.00e+00 | 8.00e+00 | ▇▁▁▁▁ |
| sqft | 136117 | 0.32 | 1.20e+03 | 5000.22 | 8.00e+01 | 7.50e+02 | 1.00e+03 | 1.36e+03 | 9.00e+05 | ▇▁▁▁▁ |
| room_in_apt | 0 | 1.00 | 0.00e+00 | 0.04 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 1.00e+00 | ▇▁▁▁▁ |
| lat | 193145 | 0.04 | 3.77e+01 | 0.35 | 3.36e+01 | 3.74e+01 | 3.78e+01 | 3.78e+01 | 4.04e+01 | ▁▁▅▇▁ |
| lon | 196484 | 0.02 | -1.22e+02 | 0.78 | -1.23e+02 | -1.22e+02 | -1.22e+02 | -1.22e+02 | -7.42e+01 | ▇▁▁▁▁ |
Make a plot that shows the top 20 cities in terms of % of classifieds between 2000-2018. You need to calculate the number of listings by city, and then convert that number to a %
# YOUR CODE GOES HERE
# Top-20 cities by share of all classifieds, shown as a horizontal bar chart.
total_number_of_listings <- nrow(rent) # total listings in the dataset; nrow() is clearer than dim(rent)[1]
city_rent <- rent %>%
group_by(city) %>% # one group per city
summarise(total_city_count = n()) %>% # number of listings per city
mutate(total_city_count_percent = (total_city_count / total_number_of_listings) ) %>% # each city's share of all listings
slice_max(order_by = total_city_count_percent, n = 20) %>% # top 20; slice_max() already returns rows in descending order, so a separate arrange() is unnecessary
ggplot(aes(x = total_city_count_percent, # x-axis: share of listings
y = fct_reorder(city, total_city_count_percent))) + # y-axis: cities reordered so the highest share sits at the top
geom_col() + # bar plot
labs(title = "San Francisco accounts for more than a quarter of all rental classifieds", # title, subtitle, caption
subtitle = "% of Craigslist listings, 2000-2018",
y = NULL,
x = NULL,
caption = "Source: Pennington, Kate (2018). Bay Area Craigslist Rental Housing Posts, 2000-2018") +
scale_x_continuous(labels = scales::percent) # format the x-axis as percentages
city_rent

Make a plot that shows the evolution of median prices in San Francisco for 0, 1, 2, and 3 bedrooms listings
# YOUR CODE GOES HERE
# Median price over time in San Francisco, one panel per bedroom count (0-3).
sf_rent <- rent %>%
filter(city == "san francisco", beds <= 3) %>% # San Francisco listings with at most 3 beds
group_by(beds, year) %>% # one median per beds/year combination
summarise(median_price = median(price, na.rm = TRUE), # price has no NAs in this dataset, but na.rm = TRUE keeps the summary robust
.groups = "drop") # return an ungrouped tibble (also silences the summarise() grouping message)
p <- ggplot(sf_rent, aes(x = year, y = median_price, # create plot
color = as.character(beds))) + # beds as character -> one discrete colour per bed count
geom_line() + # line plot
facet_wrap(~beds, nrow = 1) + # one panel per number of beds; nrow = 1 places them side by side
labs(title = "San Francisco rents have been steadily increasing", # title, subtitle, caption
subtitle = "0 to 3-bed listings, 2000-2018",
x = NULL, y = NULL,
caption = "Source: Pennington, Kate (2018). Bay Area Craigslist Rental Housing Posts, 2000-2018") +
theme(legend.position="none") # colour is redundant with the facet labels, so remove the legend
p

Finally, make a plot that shows median rental prices for the top 12 cities in the Bay area.
# YOUR CODE GOES HERE
# Median 1-bedroom rents over time for the top 12 cities (ranked by total
# listed rent of 1-bed classifieds, as in the original analysis).
top_12 <- rent %>%
filter(beds == 1) %>% # 1-bedroom listings only
group_by(city) %>% # group by city
summarise(sum_rental_price = sum(price), .groups = "drop") %>% # ranking metric: sum of listed prices per city
slice_max(order_by = sum_rental_price, n = 12) # top 12; slice_max() already sorts descending, so no arrange() needed
one_bedroom_rent <- rent %>%
filter(beds == 1, city %in% top_12$city) %>% # single dplyr filter replaces the earlier duplicated filter + base-R rent[rent$city %in% ...,] subsetting
group_by(city, year) %>% # group by city and year
summarise(median_rental_price = median(price), .groups = "drop") # median 1-bed price per city and year
one_bedroom_rent %>%
ggplot(aes(x = year, y = median_rental_price, # create plot
color = as.character(city))) + # distinct colour per city
geom_line() + # line plot
facet_wrap(~city) + # one panel per city
labs(title = "Rental prices for 1-bedroom flats in the Bay Area", # title and caption
x = NULL, y = NULL,
caption = "Source: Pennington, Kate (2018). Bay Area Craigslist Rental Housing Posts, 2000-2018") +
theme(legend.position="none") # remove legend

What can you infer from these plots? Don’t just explain what’s in the graph, but speculate or tell a short story (1-2 paragraphs max).
Prices in all top 12 cities have heavily increased from 2000 until 2018. The reason for that might be the continuing success of the tech sector in the Bay Area. This leads to exorbitant wages and therefore a lot of pressure on the housing market.
However, the increase is interrupted by years in which prices decreased. One such decrease is prevalent from ~2000-2005 which corresponds with the burst of the dot-com bubble. In 2002, the Nasdaq fell 78% from its peak which led to many online shopping companies to shut down. Many of these companies were headquartered in the Bay Area, which probably led to more unemployment and therefore lower pressure on the housing market. The second dent appeared ~2009. In 2009, the financial crisis dominated the headlines and unemployment rose, decreasing housing prices. Interestingly, there is a third dent around 2017-2018. However, NASDAQ rose during that period, therefore, we don’t think that the reason can be found in the overall economy. We assume that with housing prices more than doubling in some cities, people started moving away from the Bay Area and maybe working remotely in other places. There might be a trend behind this, heralded by Elon Musk who moved Tesla’s headquarter away from the Bay Area and to Texas.
movies <- read_csv(here::here("data", "movies.csv"))
glimpse(movies)## Rows: 2,961
## Columns: 11
## $ title <chr> "Avatar", "Titanic", "Jurassic World", "The Avenge…
## $ genre <chr> "Action", "Drama", "Action", "Action", "Action", "…
## $ director <chr> "James Cameron", "James Cameron", "Colin Trevorrow…
## $ year <dbl> 2009, 1997, 2015, 2012, 2008, 1999, 1977, 2015, 20…
## $ duration <dbl> 178, 194, 124, 173, 152, 136, 125, 141, 164, 93, 1…
## $ gross <dbl> 7.61e+08, 6.59e+08, 6.52e+08, 6.23e+08, 5.33e+08, …
## $ budget <dbl> 2.37e+08, 2.00e+08, 1.50e+08, 2.20e+08, 1.85e+08, …
## $ cast_facebook_likes <dbl> 4834, 45223, 8458, 87697, 57802, 37723, 13485, 920…
## $ votes <dbl> 886204, 793059, 418214, 995415, 1676169, 534658, 9…
## $ reviews <dbl> 3777, 2843, 1934, 2425, 5312, 3917, 1752, 1752, 35…
## $ rating <dbl> 7.9, 7.7, 7.0, 8.1, 9.0, 6.5, 8.7, 7.5, 8.5, 7.2, …
skimr::skim(movies)| Name | movies |
| Number of rows | 2961 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| title | 0 | 1 | 1 | 83 | 0 | 2907 | 0 |
| genre | 0 | 1 | 5 | 11 | 0 | 17 | 0 |
| director | 0 | 1 | 3 | 32 | 0 | 1366 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| year | 0 | 1 | 2.00e+03 | 9.95e+00 | 1920.0 | 2.00e+03 | 2.00e+03 | 2.01e+03 | 2.02e+03 | ▁▁▁▂▇ |
| duration | 0 | 1 | 1.10e+02 | 2.22e+01 | 37.0 | 9.50e+01 | 1.06e+02 | 1.19e+02 | 3.30e+02 | ▃▇▁▁▁ |
| gross | 0 | 1 | 5.81e+07 | 7.25e+07 | 703.0 | 1.23e+07 | 3.47e+07 | 7.56e+07 | 7.61e+08 | ▇▁▁▁▁ |
| budget | 0 | 1 | 4.06e+07 | 4.37e+07 | 218.0 | 1.10e+07 | 2.60e+07 | 5.50e+07 | 3.00e+08 | ▇▂▁▁▁ |
| cast_facebook_likes | 0 | 1 | 1.24e+04 | 2.05e+04 | 0.0 | 2.24e+03 | 4.60e+03 | 1.69e+04 | 6.57e+05 | ▇▁▁▁▁ |
| votes | 0 | 1 | 1.09e+05 | 1.58e+05 | 5.0 | 1.99e+04 | 5.57e+04 | 1.33e+05 | 1.69e+06 | ▇▁▁▁▁ |
| reviews | 0 | 1 | 5.03e+02 | 4.94e+02 | 2.0 | 1.99e+02 | 3.64e+02 | 6.31e+02 | 5.31e+03 | ▇▁▁▁▁ |
| rating | 0 | 1 | 6.39e+00 | 1.05e+00 | 1.6 | 5.80e+00 | 6.50e+00 | 7.10e+00 | 9.30e+00 | ▁▁▆▇▁ |
Are there any missing values (NAs)? Are all entries distinct or are there duplicate entries? After using the skim function on the dataset, we can conclude that there are no missing values. The distinct function shows that all entries in the dataset are distinct
Produce a table with the count of movies by genre, ranked in descending order
skim(movies) # no, there are no missing values| Name | movies |
| Number of rows | 2961 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| title | 0 | 1 | 1 | 83 | 0 | 2907 | 0 |
| genre | 0 | 1 | 5 | 11 | 0 | 17 | 0 |
| director | 0 | 1 | 3 | 32 | 0 | 1366 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| year | 0 | 1 | 2.00e+03 | 9.95e+00 | 1920.0 | 2.00e+03 | 2.00e+03 | 2.01e+03 | 2.02e+03 | ▁▁▁▂▇ |
| duration | 0 | 1 | 1.10e+02 | 2.22e+01 | 37.0 | 9.50e+01 | 1.06e+02 | 1.19e+02 | 3.30e+02 | ▃▇▁▁▁ |
| gross | 0 | 1 | 5.81e+07 | 7.25e+07 | 703.0 | 1.23e+07 | 3.47e+07 | 7.56e+07 | 7.61e+08 | ▇▁▁▁▁ |
| budget | 0 | 1 | 4.06e+07 | 4.37e+07 | 218.0 | 1.10e+07 | 2.60e+07 | 5.50e+07 | 3.00e+08 | ▇▂▁▁▁ |
| cast_facebook_likes | 0 | 1 | 1.24e+04 | 2.05e+04 | 0.0 | 2.24e+03 | 4.60e+03 | 1.69e+04 | 6.57e+05 | ▇▁▁▁▁ |
| votes | 0 | 1 | 1.09e+05 | 1.58e+05 | 5.0 | 1.99e+04 | 5.57e+04 | 1.33e+05 | 1.69e+06 | ▇▁▁▁▁ |
| reviews | 0 | 1 | 5.03e+02 | 4.94e+02 | 2.0 | 1.99e+02 | 3.64e+02 | 6.31e+02 | 5.31e+03 | ▇▁▁▁▁ |
| rating | 0 | 1 | 6.39e+00 | 1.05e+00 | 1.6 | 5.80e+00 | 6.50e+00 | 7.10e+00 | 9.30e+00 | ▁▁▆▇▁ |
print(paste("Number of duplicate values: ", sum(duplicated(movies)))) # no, there are no duplicate entries## [1] "Number of duplicate values: 0"
# Number of movies per genre, largest genre first.
# count() is shorthand for group_by() + summarise(n()); sort = TRUE
# returns the rows already arranged in descending order of the count.
movies_count_list <- movies %>%
count(genre, name = "movies_count", sort = TRUE)
movies_count_list

## # A tibble: 17 × 2
## genre movies_count
## <chr> <int>
## 1 Comedy 848
## 2 Action 738
## 3 Drama 498
## 4 Adventure 288
## 5 Crime 202
## 6 Biography 135
## 7 Horror 131
## 8 Animation 35
## 9 Fantasy 28
## 10 Documentary 25
## 11 Mystery 16
## 12 Sci-Fi 7
## 13 Family 3
## 14 Musical 2
## 15 Romance 2
## 16 Western 2
## 17 Thriller 1
Compute the average gross earning and budget (gross and
budget) by genre. Calculate a
variable return_on_budget which shows how many $ a
movie made at the box office for each $ of its budget. Rank genres by
this return_on_budget in descending order.

movies %>% #the movie dataset
group_by(genre) %>% #Grouping the dataset by genre
summarize(avg_gross_earning = mean(gross), #using the summarise function to calculate mean gross earnings
avg_budget = mean(budget), #calculating the mean budget
return_on_budget = avg_gross_earning/avg_budget) %>% #creating a new variable and assigning mean gross/mean budget to it
arrange(desc(return_on_budget)) #arranging the result by the new variable## # A tibble: 17 × 4
## genre avg_gross_earning avg_budget return_on_budget
## <chr> <dbl> <dbl> <dbl>
## 1 Musical 92084000 3189500 28.9
## 2 Family 149160478. 14833333. 10.1
## 3 Western 20821884 3465000 6.01
## 4 Documentary 17353973. 5887852. 2.95
## 5 Horror 37713738. 13504916. 2.79
## 6 Fantasy 42408841. 17582143. 2.41
## 7 Comedy 42630552. 24446319. 1.74
## 8 Mystery 67533021. 39218750 1.72
## 9 Animation 98433792. 61701429. 1.60
## 10 Biography 45201805. 28543696. 1.58
## 11 Adventure 95794257. 66290069. 1.45
## 12 Drama 37465371. 26242933. 1.43
## 13 Crime 37502397. 26596169. 1.41
## 14 Romance 31264848. 25107500 1.25
## 15 Action 86583860. 71354888. 1.21
## 16 Sci-Fi 29788371. 27607143. 1.08
## 17 Thriller 2468 300000 0.00823
movies %>% #movie dataset
group_by(director) %>% #grouping by director
summarize(total_gross_amount = sum(gross), #using summarise fn to find the total_gross_amount
mean_gross_amount = mean(gross), #using the summarise fn to find mean_gross_amount
median_gross_amount = median(gross), #using the summarise fn to find median_gross_amount
standard_deviation_gross_amount = sd(gross)) %>% #using the summarise fn to find sd_gross_amount
arrange(desc(total_gross_amount)) %>% #arranging in descending orer by total_gross_amount
slice_max(order_by = total_gross_amount, n=15) #displaying the top 15 values using slice_max fn## # A tibble: 15 × 5
## director total_gross_amount mean_gross_amount median_gross…¹ stand…²
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Steven Spielberg 4014061704 174524422. 164435221 1.01e8
## 2 Michael Bay 2231242537 171634041. 138396624 1.27e8
## 3 Tim Burton 2071275480 129454718. 76519172 1.09e8
## 4 Sam Raimi 2014600898 201460090. 234903076 1.62e8
## 5 James Cameron 1909725910 318287652. 175562880. 3.09e8
## 6 Christopher Nolan 1813227576 226653447 196667606. 1.87e8
## 7 George Lucas 1741418480 348283696 380262555 1.46e8
## 8 Robert Zemeckis 1619309108 124562239. 100853835 9.13e7
## 9 Clint Eastwood 1378321100 72543216. 46700000 7.55e7
## 10 Francis Lawrence 1358501971 271700394. 281666058 1.35e8
## 11 Ron Howard 1335988092 111332341 101587923 8.19e7
## 12 Gore Verbinski 1329600995 189942999. 123207194 1.54e8
## 13 Andrew Adamson 1137446920 284361730 279680930. 1.21e8
## 14 Shawn Levy 1129750988 102704635. 85463309 6.55e7
## 15 Ridley Scott 1128857598 80632686. 47775715 6.88e7
## # … with abbreviated variable names ¹median_gross_amount,
## # ²standard_deviation_gross_amount
# Summary statistics of IMDB ratings by genre: mean, min, max, median, sd.
# across() applies every summary function to `rating` in one call;
# .names = "{.fn}_rating" reproduces the original column names and order.
movies_ratings_by_genre <- movies %>%
group_by(genre) %>%
summarise(across(rating,
list(mean = mean, min = min, max = max,
median = median, sd = sd),
.names = "{.fn}_rating")) %>%
arrange(desc(mean_rating)) # best-rated genres first
movies_ratings_by_genre

## # A tibble: 17 × 6
## genre mean_rating min_rating max_rating median_rating sd_rating
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Biography 7.11 4.5 8.9 7.2 0.760
## 2 Crime 6.92 4.8 9.3 6.9 0.849
## 3 Mystery 6.86 4.6 8.5 6.9 0.882
## 4 Musical 6.75 6.3 7.2 6.75 0.636
## 5 Drama 6.73 2.1 8.8 6.8 0.917
## 6 Documentary 6.66 1.6 8.5 7.4 1.77
## 7 Sci-Fi 6.66 5 8.2 6.4 1.09
## 8 Animation 6.65 4.5 8 6.9 0.968
## 9 Romance 6.65 6.2 7.1 6.65 0.636
## 10 Adventure 6.51 2.3 8.6 6.6 1.09
## 11 Family 6.5 5.7 7.9 5.9 1.22
## 12 Action 6.23 2.1 9 6.3 1.03
## 13 Fantasy 6.15 4.3 7.9 6.45 0.959
## 14 Comedy 6.11 1.9 8.8 6.2 1.02
## 15 Horror 5.83 3.6 8.5 5.9 1.01
## 16 Western 5.7 4.1 7.3 5.7 2.26
## 17 Thriller 4.8 4.8 4.8 4.8 NA
ggplot(movies, aes(x = rating)) + #plotting a histogram using movie dataset with rating on the x-axis
geom_density() + #this specifies that the plot is a density plot
facet_wrap(~genre) + #facet_wrap by genre to see how ratings are distributed among genre
labs(title = "Distribution of ratings by genre",
x = "Rating", y = NULL)

Use ggplot to answer the following: examine the relationship between gross and
cast_facebook_likes. Produce a scatterplot and write one
sentence discussing whether the number of facebook likes that the cast
has received is likely to be a good predictor of how much money a movie
will make at the box office. What variable are you going to map to the
Y- and X- axes?# create a plot with the cast's facebook likes on the x-axis and the gross amount on the y-axis. The reason for the variable assignment to the axes is that we want to examine how the facebook likes affect the gross, not the other way around. The affected variable usually goes on the y-axis
ggplot(movies, aes(x=cast_facebook_likes, y=gross)) +
geom_point() +
scale_x_log10() +
scale_y_log10() +
geom_smooth(method = "lm", se = FALSE) + # add line of best fit according to a linear regression model
labs(title = "Relationship between gross amount and cast's facebook likes",
x = "Cast facebook likes",
y = "Gross amount")

As observed from the scatterplot, the general consensus seems to be that the number of likes is a good measure of how well a movie will do at the box office. This could be because many people check the number of likes that a movie has received on Facebook and then decide whether to watch the movie or not. The trend shows that most of the movies that have received a higher number of likes have also done well at the box office. We have plotted the cast_facebook_likes on the x-axis and the gross earnings on the y-axis.
gross and
budget. Produce a scatterplot and write one sentence
discussing whether budget is likely to be a good predictor of how much
money a movie will make at the box office.#using movie dataset to find the correlation between budget and gross earnings of movies. geom_point is used to create a scatterplot and the axis have been scaled to visualise the results better
ggplot(movies, aes(x=budget,y=gross)) +
geom_point() +
scale_x_log10() +
scale_y_log10() +
geom_smooth(method = "lm", se = FALSE) + # add line of best fit according to a linear regression model
labs(title = "Relationship between a movies' gross amount and budget",
x = "Budget", y = "Gross amount")As observed from the scatterplot, movies with higher budget are more likely to do well at the box office apart from some outliers.
-Examine the relationship between gross and
rating. Produce a scatterplot, faceted by
genre and discuss whether IMDB ratings are likely to be a
good predictor of how much money a movie will make at the box office. Is
there anything strange in this dataset?
For documentary and sci-fi movies, we have a negative correlation between the gross amount and the rating. Therefore, this dataset suggests that the more one spends in producing documentaries and sci-fi movies, the worse the rating is going to get. However, one has to point out that our samples for both documentary and sci-fi movies are small. Therefore, we can’t be certain that the sample correlation is also prevalent in the entire population.
#This chunk of code is used to plot the correlation between gross and rating across various genres
ggplot(movies, aes(y = gross, x = rating)) +
geom_point() +
facet_wrap(~genre) +
geom_smooth(method = "lm", se = FALSE) + # add line of best fit according to a linear regression model
labs(title = "Relationship between a movies' gross amount anbd rating",
y = "Gross amount", x = "Rating")

nyse <- read_csv(here::here("data","nyse.csv"))

Based on this dataset, create a table and a bar plot that shows the number of companies per sector, in descending order
# YOUR CODE GOES HERE
# Number of companies per sector, in descending order.
com_sector_nyse <- nyse %>%
group_by(sector) %>% # one group per sector
summarise(num_com_per_sec = n()) %>% # n() counts rows per group; dplyr::count() is a data-frame verb and errors when called on a column inside summarise()
arrange(desc(num_com_per_sec)) # largest sectors first
com_sector_nyse## # A tibble: 12 × 2
## sector num_com_per_sec
## <chr> <int>
## 1 Finance 97
## 2 Consumer Services 79
## 3 Public Utilities 60
## 4 Capital Goods 45
## 5 Health Care 45
## 6 Energy 42
## 7 Technology 40
## 8 Basic Industries 39
## 9 Consumer Non-Durables 31
## 10 Miscellaneous 12
## 11 Transportation 10
## 12 Consumer Durables 8
com_sector_nyse %>%
ggplot(aes(x = fct_reorder(sector,desc(num_com_per_sec)), y = num_com_per_sec)) + #Plot in descending order
geom_col() +
labs(title = "Number of companies per sector",
x = "Sector name",
y = "Numbers of companies")

Next, let’s choose some stocks and their ticker symbols and download
some data. You MUST choose 6 different stocks from the
ones listed below; You should, however, add SPY which is
the SP500 ETF (Exchange Traded Fund).
# Notice the cache=TRUE argument in the chunk options. Because getting data is time-consuming,
# cache=TRUE means that once it downloads data, the chunk will not run again next time you knit your Rmd
myStocks <- c("AAPL","JPM","DIS","DPZ","ANF","TSLA","SPY" ) %>% # the six chosen stocks plus SPY, the S&P 500 ETF (seven tickers in total)
tq_get(get = "stock.prices", # download daily price data for each ticker
from = "2011-01-01",
to = "2022-08-31") %>%
group_by(symbol) # keep the data grouped by ticker so per-symbol returns can be computed below
glimpse(myStocks) # examine the structure of the resulting data frame## Rows: 20,545
## Columns: 8
## Groups: symbol [7]
## $ symbol <chr> "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL…
## $ date <date> 2011-01-03, 2011-01-04, 2011-01-05, 2011-01-06, 2011-01-07, …
## $ open <dbl> 11.6, 11.9, 11.8, 12.0, 11.9, 12.1, 12.3, 12.3, 12.3, 12.4, 1…
## $ high <dbl> 11.8, 11.9, 11.9, 12.0, 12.0, 12.3, 12.3, 12.3, 12.4, 12.4, 1…
## $ low <dbl> 11.6, 11.7, 11.8, 11.9, 11.9, 12.0, 12.1, 12.2, 12.3, 12.3, 1…
## $ close <dbl> 11.8, 11.8, 11.9, 11.9, 12.0, 12.2, 12.2, 12.3, 12.3, 12.4, 1…
## $ volume <dbl> 4.45e+08, 3.09e+08, 2.56e+08, 3.00e+08, 3.12e+08, 4.49e+08, 4…
## $ adjusted <dbl> 10.05, 10.10, 10.18, 10.18, 10.25, 10.44, 10.42, 10.50, 10.54…
Financial performance analysis depend on returns; If I buy a stock today for 100 and I sell it tomorrow for 101.75, my one-day return, assuming no transaction costs, is 1.75%. So given the adjusted closing prices, our first step is to calculate daily and monthly returns.
#calculate daily returns
# NOTE(review): the original calls passed `cols = c(nested.col)`, but no
# object named `nested.col` is defined anywhere in this script — it is
# leftover boilerplate from old tidyquant examples, so it is dropped here.
myStocks_returns_daily <- myStocks %>%
tq_transmute(select = adjusted, # use the adjusted close (accounts for splits/dividends)
mutate_fun = periodReturn,
period = "daily",
type = "log", # log returns for the daily series
col_rename = "daily_returns")
#calculate monthly returns
myStocks_returns_monthly <- myStocks %>%
tq_transmute(select = adjusted,
mutate_fun = periodReturn,
period = "monthly",
type = "arithmetic", # simple (arithmetic) returns for the monthly series
col_rename = "monthly_returns")
#calculate yearly returns
# Arithmetic yearly returns computed from the adjusted close.
# FIX: removed the leftover `cols = c(nested.col)` template argument
# (`nested.col` is never defined), and dropped the redundant group_by(symbol) —
# myStocks is already grouped by symbol when it is created.
myStocks_returns_annual <- myStocks %>%
  tq_transmute(select = adjusted,
               mutate_fun = periodReturn,
               period = "yearly",
               type = "arithmetic",
               col_rename = "yearly_returns")
Create a table where you summarise monthly returns for each of the
stocks and SPY; min, max, median, mean, SD.
# YOUR CODE GOES HERE
# Summary statistics of monthly returns, one row per ticker.
# across() applies every function in the list to monthly_returns;
# .names = "{.fn}" keeps the column names min/max/median/mean/SD.
Monthly_return <- myStocks_returns_monthly %>%
  group_by(symbol) %>%
  summarise(across(monthly_returns,
                   list(min = min, max = max, median = median,
                        mean = mean, SD = sd),
                   .names = "{.fn}"))
Monthly_return
## # A tibble: 7 × 6
## symbol min max median mean SD
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AAPL -0.181 0.217 0.0230 0.0230 0.0791
## 2 ANF -0.421 0.507 0.00105 0.00337 0.146
## 3 DIS -0.186 0.234 0.00725 0.0113 0.0721
## 4 DPZ -0.194 0.342 0.0246 0.0270 0.0774
## 5 JPM -0.229 0.202 0.0199 0.0119 0.0727
## 6 SPY -0.125 0.127 0.0146 0.0106 0.0404
## 7 TSLA -0.224 0.811 0.0117 0.0501 0.177
Plot a density plot, using geom_density(), for each of
the stocks
# A single density plot with one curve per ticker makes cross-stock comparison easy
myStocks_returns_monthly %>%
  ggplot(aes(x = monthly_returns, color = symbol)) +
  geom_density() +
  labs(title = "Density plot of monthly returns for selected stocks",
       x = "Monthly returns",
       y = "Density")
What can you infer from this plot? Which stock is the riskiest? The least risky?
From this plot we can compare the spread of each stock’s monthly returns: a wider, flatter density means more volatile returns, i.e. more risk.
The riskiest stock is TSLA, whose distribution is by far the widest.
The least risky stock is DIS: it has the most concentrated distribution, so it might be the first choice for conservative investors.
Finally, make a plot that shows the expected monthly return
(mean) of a stock on the Y axis and the risk (standard deviation) in the
X-axis. Please use ggrepel::geom_text_repel() to label each
stock
# Risk-return scatter: expected monthly return (mean) on the y-axis,
# risk (standard deviation) on the x-axis, one labelled point per ticker
Monthly_return %>%
  ggplot(aes(x = SD, y = mean)) +
  geom_point() +
  geom_text_repel(aes(label = symbol),
                  nudge_x = 1,
                  na.rm = TRUE) +
  labs(title = "Examining the relationship between a stock's standard deviation and its monthly return",
       x = "Standard deviation",
       y = "Expected monthly return")
What can you infer from this plot? Are there any stocks which, while being riskier, do not have a higher expected return?
TYPE YOUR ANSWER AFTER (AND OUTSIDE!) THIS BLOCKQUOTE.
Generally speaking, higher risk brings higher return. This is because investors want to be reimbursed for the risk they are taking.
However, Abercrombie & Fitch has a high standard deviation but low expected return. One can infer that Abercrombie & Fitch as a company wasn’t very successful during this time period.
spotify_songs <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv')Produce a one-page summary describing this dataset. Here is a non-exhaustive list of questions:
track_popularity). Does it look like a Normal
distribution?# As the songs are duplicated across multiple playlists, it's efficient to find the distinct songs for better analysis
# Songs are duplicated across playlists, so keep one row per track_id
spotify_songs_unique <- distinct(spotify_songs, track_id, .keep_all = TRUE)
spotify_songs_unique %>%
  ggplot(aes(x = track_popularity)) +
  geom_histogram() +
  labs(title = "Distribution of track_popularity",
       x = "track_popularity",
       y = "frequency")
Even though it’s not a perfect bell curve, the distribution of track_popularity closely resembles a Normal distribution curve. However, the number of songs with zero popularity is high in number, causing a high spike in the curve.
acousticness, liveness,
speechinessand instrumentalness, perceptual
measures like energy, loudness,
danceability and valence (positiveness), and
descriptors like duration, tempo,
key, and mode. How are they distributed? can
you roughly guess which of these variables is closer to Normal just by
looking at summary statistics?# Initial analysis is done here, using the summary of given spotify data
spotify_songs_summary <- summary(spotify_songs)
# Use knitr library for adding titles and scrollbar to the table
knitr::kable(spotify_songs_summary, "html") %>% kable_styling("striped") %>%
kableExtra::scroll_box(width = "100%", height = "100%")| track_id | track_name | track_artist | track_popularity | track_album_id | track_album_name | track_album_release_date | playlist_name | playlist_id | playlist_genre | playlist_subgenre | danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Length:32833 | Length:32833 | Length:32833 | Min. : 0.0 | Length:32833 | Length:32833 | Length:32833 | Length:32833 | Length:32833 | Length:32833 | Length:32833 | Min. :0.000 | Min. :0.000 | Min. : 0.00 | Min. :-46.4 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. : 0 | Min. : 4000 | |
| Class :character | Class :character | Class :character | 1st Qu.: 24.0 | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | 1st Qu.:0.563 | 1st Qu.:0.581 | 1st Qu.: 2.00 | 1st Qu.: -8.2 | 1st Qu.:0.000 | 1st Qu.:0.041 | 1st Qu.:0.015 | 1st Qu.:0.000 | 1st Qu.:0.093 | 1st Qu.:0.331 | 1st Qu.:100 | 1st Qu.:187819 | |
| Mode :character | Mode :character | Mode :character | Median : 45.0 | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Median :0.672 | Median :0.721 | Median : 6.00 | Median : -6.2 | Median :1.000 | Median :0.062 | Median :0.080 | Median :0.000 | Median :0.127 | Median :0.512 | Median :122 | Median :216000 | |
| NA | NA | NA | Mean : 42.5 | NA | NA | NA | NA | NA | NA | NA | Mean :0.655 | Mean :0.699 | Mean : 5.37 | Mean : -6.7 | Mean :0.566 | Mean :0.107 | Mean :0.175 | Mean :0.085 | Mean :0.190 | Mean :0.511 | Mean :121 | Mean :225800 | |
| NA | NA | NA | 3rd Qu.: 62.0 | NA | NA | NA | NA | NA | NA | NA | 3rd Qu.:0.761 | 3rd Qu.:0.840 | 3rd Qu.: 9.00 | 3rd Qu.: -4.6 | 3rd Qu.:1.000 | 3rd Qu.:0.132 | 3rd Qu.:0.255 | 3rd Qu.:0.005 | 3rd Qu.:0.248 | 3rd Qu.:0.693 | 3rd Qu.:134 | 3rd Qu.:253585 | |
| NA | NA | NA | Max. :100.0 | NA | NA | NA | NA | NA | NA | NA | Max. :0.983 | Max. :1.000 | Max. :11.00 | Max. : 1.3 | Max. :1.000 | Max. :0.918 | Max. :0.994 | Max. :0.994 | Max. :0.996 | Max. :0.991 | Max. :239 | Max. :517810 |
spotify_songs_unique <- spotify_songs %>% distinct(track_id, .keep_all = TRUE)
# Plot each column to accurately determine the distribution of the data.
# DRY fix: the original repeated an identical ggplot call ten times with only
# the variable name changed. Iterate over the names instead; .data[[v]] maps a
# string to the corresponding column, and print() is required so the plots
# render from inside a loop when the document is knitted.
dist_vars <- c("acousticness", "liveness", "speechiness", "instrumentalness",
               "energy", "loudness", "danceability", "valence",
               "duration_ms", "tempo")
for (v in dist_vars) {
  p <- ggplot(spotify_songs_unique, aes(x = .data[[v]])) +
    geom_histogram() +
    labs(title = paste("Distribution of", v),
         x = v,
         y = "frequency")
  print(p)
}
A distribution is normal when it is distributed along a bell curve and the mode, mean and median are the same. In our opinion it is difficult to say just from the summary statistics whether a distribution is normal because we don’t have entire context of data. Even if a variable might have the same mean and median it doesn’t mean that it will necessarily be distributed normally.
In order to see whether a variable is normally distributed, we created histograms for the 12 variables. From these histograms we can see that some variables seem to be more normally distributed which goes hand in hand with the summary stats showing that when the mean and the median have lower difference, the curve has higher chance of being a normal curve.
valence and
track_popularity? danceability and
track_popularity ?# As the songs are duplicated across multiple playlists, it's efficient to find the distinct songs for better analysis
# Songs are duplicated across playlists, so keep one row per track_id
spotify_songs_unique <- distinct(spotify_songs, track_id, .keep_all = TRUE)
# Scatter plot (with a smoother) of valence vs track_popularity to eyeball any correlation
track_popularity_and_valence <- spotify_songs_unique %>%
  ggplot(aes(x = valence, y = track_popularity)) +
  geom_point() +
  geom_smooth(method = "auto", se = TRUE, fullrange = FALSE, level = 0.95) +
  labs(title = "Scatter plot of valence and track_popularity",
       x = "valence",
       y = "track_popularity")
track_popularity_and_valence
# Scatter plot (with a smoother) of danceability vs track_popularity
track_popularity_and_danceability <- spotify_songs_unique %>%
  ggplot(aes(x = danceability, y = track_popularity)) +
  geom_point() +
  geom_smooth(method = "auto", se = TRUE, fullrange = FALSE, level = 0.95) +
  labs(title = "Scatter plot of danceability and track_popularity",
       x = "danceability",
       y = "track_popularity")
track_popularity_and_danceability
So, according to our scatter plots we can see that there doesn’t seem to be a correlation between valence and track popularity or danceability and track popularity.
Valence and track_popularity have a straight line. Hence, no matter what the valence value is, the track popularity stays the same.
This is also the case for danceability and track popularity: even though there might be a small increase in popularity when the track is more danceable, the effect is not statistically significant.
mode indicates the modality (major or minor) of a
track, the type of scale from which its melodic content is derived.
Major is represented by 1 and minor is 0. Do songs written on a major
scale have higher danceability compared to those in minor
scale? What about track_popularity?# As the songs are duplicated across multiple playlists, it's efficient to find the distinct songs for better analysis
# Songs are duplicated across playlists, so keep one row per track_id
spotify_songs_unique <- distinct(spotify_songs, track_id, .keep_all = TRUE)
# Histogram of danceability, one facet per mode (0 = minor, 1 = major)
danceability <- spotify_songs_unique %>%
  ggplot(aes(x = danceability)) +
  geom_histogram() +
  facet_grid(~mode) +
  labs(title = "Distribution of danceability in Major and Minor scale",
       x = "danceability",
       y = "frequency")
danceability
# Histogram of track_popularity, one facet per mode
track_popularity <- spotify_songs_unique %>%
  ggplot(aes(x = track_popularity)) +
  geom_histogram() +
  facet_grid(~mode) +
  labs(title = "Distribution of track_popularity in Major and Minor scale",
       x = "track_popularity",
       y = "frequency")
track_popularity
There are slightly more songs on major scale that are danceable. Even though the danceability plot looks very similar whether it is in major or minor, there is a greater number of songs in major, hence, major songs have slightly higher danceability.
Looking at the graphs for track popularity we can see that it is more or less the same. The distributions are very similar but if the scale is major there is a higher count of popular songs.
However, since the curves are extremely similar in both major or minor, having a different scale doesn’t really add more value to danceability or track popularity.
The purpose of this exercise is to reproduce a plot using your
dplyr and ggplot2 skills. It builds on
exercise 1, the San Francisco rentals data.
You have to create a graph that calculates the cumulative % change for 0-, 1-, and 2-bed flats between 2000 and 2018 for the top twelve cities in the Bay Area, by number of ads that appeared in Craigslist. Your final graph should look like this
# YOUR CODE GOES HERE
# Top 12 Bay Area cities by number of Craigslist ads.
# FIXES vs the original: removed the unused `total_count` variable; replaced
# arrange() + head() with an explicit slice_max() (with_ties = FALSE keeps
# exactly 12 rows, matching head(12)).
top_12_cities <- rent %>%
  count(city, name = "total_count_city") %>%
  slice_max(total_count_city, n = 12, with_ties = FALSE)
# Median price for 0-, 1- and 2-bed flats in those cities, per city/beds/year
city_rent <- rent %>%
  filter(beds < 3, city %in% top_12_cities$city) %>%
  group_by(city, beds, year) %>%
  summarise(median_rental = median(price))
# Cumulative change in the median rental relative to the first observed year.
# arrange(year, .by_group = TRUE) guarantees lag()/cumprod() see the years in
# chronological order; the first year of each series has no predecessor, so its
# NA ratio is replaced with 1 (no change).
final_solution <- city_rent %>%
  group_by(city, beds) %>%
  arrange(year, .by_group = TRUE) %>%
  mutate(pct_change = median_rental / lag(median_rental)) %>%
  mutate(pct_change = ifelse(is.na(pct_change), 1, pct_change)) %>%
  mutate(cumulative_change = cumprod(pct_change))
# Line plot of the cumulative change, faceted by beds (rows) and city (columns)
ggplot(final_solution, aes(x = year, y = cumulative_change, color = city)) +
  geom_line() +
  facet_grid(beds ~ city) +
  labs(title = "Cumulative % change in 0,1, and 2-bed rentals in Bay Area",
       subtitle = "2000-2018",
       x = NULL,
       y = NULL) +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))
# Make sure you use vroom() as it is significantly faster than read.csv()
# Load the 2016 CA contributions and the zip-code lookup table from data/
CA_contributors_2016 <- vroom::vroom(here::here("data","CA_contributors_2016.csv"))
zip_code_database <- vroom::vroom(here::here("data","zip_code_database.csv"))
# Set zip columns in both datasets to character to prepare for join
# NOTE(review): as.character() on a numeric zip drops leading zeros (e.g. 02134
# -> "2134"). California zips all start with 9 so the join below is unaffected,
# but confirm before reusing this code for other states.
CA_contributors_2016 <- CA_contributors_2016 %>%
mutate(zip = as.character(zip))
zip_code_database_new <- zip_code_database %>%
mutate(zip = as.character(zip))
# Join datasets with a left join, keeping every contribution row and attaching
# the city/location columns (e.g. primary_city) for its zip when available
CA_contributors_2016 <- left_join(CA_contributors_2016, zip_code_database_new, by="zip")
# Create Hillary plot
hillary <- CA_contributors_2016 %>%
  filter(cand_nm == "Clinton, Hillary Rodham") %>%
  # count(wt = ...) is shorthand for group_by() + summarise(sum(...)):
  # total amount donated to Hillary in each city
  count(primary_city, wt = contb_receipt_amt, name = "total_amt") %>%
  slice_max(order_by = total_amt, n = 10) %>%
  mutate(primary_city = fct_reorder(primary_city, total_amt)) %>%
  ggplot(aes(x = total_amt, y = primary_city)) +
  geom_col(fill = "blue") +
  scale_x_continuous(labels = scales::dollar_format()) +
  labs(title = "Where did candidates raise most money?",
       subtitle = "Clinton, Hillary Rodham")
# Create Donald plot
trump <- CA_contributors_2016 %>%
  filter(cand_nm == "Trump, Donald J.") %>%
  # total amount donated to Trump in each city, then keep the top 10
  count(primary_city, wt = contb_receipt_amt, name = "total_amt") %>%
  slice_max(order_by = total_amt, n = 10) %>%
  mutate(primary_city = fct_reorder(primary_city, total_amt)) %>%
  ggplot(aes(x = total_amt, y = primary_city)) +
  geom_col(fill = "red") +
  scale_x_continuous(labels = scales::dollar_format()) +
  labs(subtitle = "Trump, Donald J.")
# Join the two plots side by side (patchwork); each theme/labs applies to the
# plot immediately before it
hillary + theme_bw(base_size = 14) + labs(x = "Amount Raised", y = "Primary City") +
  trump + theme_bw(base_size = 14) + labs(x = "Amount Raised", y = "Primary City")
# Create a plot for the top 10 candidates. We are not sure whether that is required but it worked for 10 candidates so it will work for two candidates as well
# Find the top 10 candidates by total amount raised
top_ten <- CA_contributors_2016 %>%
  group_by(cand_nm) %>%
  summarise(total_amt = sum(contb_receipt_amt)) %>%
  slice_max(order_by = total_amt, n = 10)
# Filter for the top 10 candidates in the original dataset.
# BUG FIX: the original used `cand_nm == top_ten$cand_nm`, which recycles the
# 10-element vector against the column and silently keeps the wrong rows;
# set membership needs %in%.
CA_2016 <- CA_contributors_2016 %>%
  filter(cand_nm %in% top_ten$cand_nm) %>%
  group_by(cand_nm, primary_city) %>%
  summarise(total = sum(contb_receipt_amt))
# Top 10 cities per candidate, one facet each.
# Replaced the deprecated top_n(10) (implicit wt + console message) with an
# explicit slice_max(); reorder_within()/scale_x_reordered() are the tidytext
# helpers that order bars independently within each facet.
CA_2016 %>%
  group_by(cand_nm) %>%
  slice_max(order_by = total, n = 10) %>%
  ungroup() %>%
  mutate(primary_city = reorder_within(primary_city, total, cand_nm)) %>%
  ggplot(aes(primary_city, total, fill = cand_nm)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~cand_nm, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  scale_y_continuous(expand = c(0, 0)) + # include 0 in the amount axis
  # after coord_flip() the x aesthetic (cities) is drawn vertically, so the
  # axis titles were swapped in the original; corrected here
  labs(title = "Top 10 candidates donations in the Bay Area",
       x = "Cities", y = "Total amount donated")
There is a lot of explanatory text, comments, etc. You do not need these, so delete them and produce a stand-alone document that you could share with someone. Knit the edited and completed R Markdown file as an HTML document (use the “Knit” button at the top of the script editor window) and upload it to Canvas.